from pipeline example described by Pierre-Luc Germain, course sta426 UZH

Loading necessary libraries

Preprocessing & Clustering

Loading the data

## data can be downloaded from : http://imlspenticton.uzh.ch/dump/files_for_levesque.tar

# set local path
local.path <- getwd()
setwd(local.path)

#define data we want to look at
sample_names <- list("patient1_HS", "patient1_SCC", "patient2_HS", "patient2_AK")
patient1_HS.path <- file.path("data", "patient1_HS")
patient1_SCC.path <- file.path("data", "patient1_SCC")
patient2_HS.path <- file.path("data", "patient2_HS")
patient2_AK.path <- file.path("data", "patient2_AK")

# read in filtered data and create list of SingleCellExperiment objects
paths <- list(patient1_HS.path, patient1_SCC.path, patient2_HS.path, patient2_AK.path)
sces <- lapply(paths, function(i) read10xCounts(file.path(i, "outs/filtered_feature_bc_matrix"), col.names = TRUE))
split_data <- function(sceo) {
  # Split the data, store ADT in alternative experiment
  sceo <- splitAltExps(sceo, rowData(sceo)$Type)
  
  # Coerce sparse matrix for ADT into a dense matrix
  counts(altExp(sceo)) <- as.matrix(counts(altExp(sceo)))
  sceo
}

sces <- lapply(sces, split_data)

# only select the gene expression data, drop the antibody capture
sce.patient1_HS <- sces[[1]]
sce.patient1_SCC <- sces[[2]]
sce.patient2_HS <- sces[[3]]
sce.patient2_AK <- sces[[4]]

# add name to sce for output graph
attr(sce.patient1_HS, "name") <- "Patient1 HS"
attr(sce.patient1_SCC, "name") <- "Patient1 SCC"
attr(sce.patient2_HS, "name") <- "Patient2 HS"
attr(sce.patient2_AK, "name") <- "Patient2 AK"

Normalization & reduction

ADD A NICE TABLE WITH ALL THE MARKER USED (and their source) AND EXPLAINED WHICH WEREN’T FOUND also explain that we decided to add the known markers from the litterature into the group of genes used to produce a low-dimensional representation of the data even when they were not part of the most variable genes

# run PCA

# using known genes
# genes not found in the data : "DCDN" "WISP2" "SEPP1" "TYP1" 
# genes linked to pericytevSMC : "ACTA2","TAGLN"
# were removed from the following list
known_markers <- c("DSC3","DSP","LGALS7","KRT5","KRT14","COL17A1","KRT1","KRT10","LOR", "SPINK5","KRT6B","GJB2","GJB6","ATP1B1","MGST1","FASN","DCD","AQP5","LUM","MMP2","SLPI","CTHRC1","MFAP5","TSPAN8","CCL19","APOE","CXCL2","CXCL3","EFEMP1","APCDD1","ID1","WIF1","COL18A1","PTGDS","ASPN","POSTN","GPC3","TNN","SFRP1","RGS5","MYL9","TPM2","RERGL","PECAM1","SELE","CLDN5","VWF","PROX1","LYVE1","HLA-DRA","FCER1G","TYROBP","AIF1","CD1C","CD207","CD68","RNASE1","ITGAX","CD3D","CD3E","CD52","IL7R","DCT","MLANA","PMEL","CDH19","NGFR","UBE2C","PCNA")


sc_PCA <- function(sceo, hvgo, known_markers) {
  # only gives index of the first encountered
  known_genes <- rownames(sceo)[which(rowData(sceo)$Symbol %in% known_markers)]
  
  # Check that the genes we know are also part of the highly variable genes
  idx_notfound <- which(!(known_genes %in% hvgo))
  
  # print the markers that were not found in the data
  #known_markers[idx_notfound]
  
  # if some of the known markers were not selected, add them
  if (length(idx_notfound) > 0) {
    cat("Adding", known_markers[idx_notfound], "to the gene set used for PCA in", attr(sceo, "name"), "\n")
    hvgo <- c(hvgo, known_genes[idx_notfound])
  }
    
  # using highly variable genes
  sceo <- runPCA(sceo, subset_row=hvgo)
  
  # check the variance explained by the PCs:
  pc.var <- attr(reducedDim(sceo),"percentVar")
  plot(pc.var, xlab="PCs", ylab="% variance explained", main=paste("Variance explained across PCs for", attr(sceo, "name")))
  
  # restrict to the first 20 components:
  reducedDim(sceo) <- reducedDim(sceo)[,1:20]
  
  # run TSNE and UMAP based on the PCA:
  sceo <- runTSNE(sceo, dimred="PCA")
  sceo <- runUMAP(sceo, dimred="PCA")
}

sce.patient1_HS <- sc_PCA(sce.patient1_HS, hvg.patient1_HS, known_markers)
## Adding WIF1 IL7R to the gene set used for PCA in Patient1 HS

sce.patient1_SCC <- sc_PCA(sce.patient1_SCC, hvg.patient1_SCC, known_markers)
## Adding KRT10 CXCL2 TNN SFRP1 AIF1 CD3E NGFR to the gene set used for PCA in Patient1 SCC

sce.patient2_HS <- sc_PCA(sce.patient2_HS, hvg.patient2_HS, known_markers)
## Adding WIF1 AIF1 CD207 ITGAX CD3E to the gene set used for PCA in Patient2 HS

sce.patient2_AK <- sc_PCA(sce.patient2_AK, hvg.patient2_AK, known_markers)
## Adding SFRP1 FCER1G ITGAX CD3E to the gene set used for PCA in Patient2 AK

Clustering

sc_cluster <- function(sceo, k=30) {
  go <- buildKNNGraph(sceo, BNPARAM=AnnoyParam(), use.dimred="PCA", k=k)

  sceo$cluster <- as.factor(cluster_louvain(go)$membership)

  #plots <- plot_grid( plotTSNE(sceo, colour_by="cluster", text_by="cluster") + ggtitle(attr(sceo, "name")), plotUMAP(sceo, colour_by="cluster", text_by="cluster") + ggtitle(attr(sceo, "name")) ) 
  
  plots <- plot_grid( 
    plotTSNE(sceo, colour_by="cluster", text_by="cluster"), 
    plotUMAP(sceo, colour_by="cluster", text_by="cluster"))
  title <- ggdraw() + draw_label(attr(sceo, "name"), fontface='bold')
  plots <- plot_grid(title, plots, ncol=1, rel_heights=c(0.1, 1))
  print(plots)
  
  return(list(sceo, go))
}

results <- sc_cluster(sce.patient1_HS)

sce.patient1_HS <- results[[1]]
g.patient1_HS <- results[[2]]

results <- sc_cluster(sce.patient1_SCC)

sce.patient1_SCC <- results[[1]]
g.patient1_SCC <- results[[2]]

results <- sc_cluster(sce.patient2_HS)

sce.patient2_HS <- results[[1]]
g.patient2_HS <- results[[2]]

results <- sc_cluster(sce.patient2_AK)

sce.patient2_AK <- results[[1]]
g.patient2_AK <- results[[2]]

Cluster annotation

Known markers

# genes not found in our data : "DCDN" "WISP2" "SEPP1" "TYP1" 
# were removed from the list below
genes <- list(
  # classification: https://www.sciencedirect.com/science/article/pii/S0923181120301985?via%3Dihub#bib0050
  # KERATINOCYTE
  keratinocyte = c("DSC3","DSP","LGALS7"),
  #keratinocyte_basal = c("KRT5","KRT14","COL17A1"),
  #keratinocyte_suprabasal = c("KRT1","KRT10"),
  #keratinocyte_differentiated = c("LOR", "SPINK5"),
  #keratinocyte_ORS = c("KRT6B"),
  #keratinocyte_channel = c("GJB2","GJB6","ATP1B1"),
  #keratinocyte_sebaceous_gland = c("MGST1","FASN"),
  #keratinocyte_sweat_gland  = c("DCD","AQP5"),
  # FIBROBLAST
  fibroblast  = c("LUM","MMP2"),
  # fibroblast subclassification  from: https://www.nature.com/articles/s42003-020-0922-4#
  #fibroblast_secretory_reticular = c(SLPI","CTHRC1","MFAP5","TSPAN8"),
  #fibroblast_proinflammatory = c("CCL19","APOE","CXCL2","CXCL3","EFEMP1"),
  #fibroblast_secretory_papillary = c("APCDD1","ID1","WIF1","COL18A1","PTGDS"),
  #fibroblast_mesenchymal = c("ASPN","POSTN","GPC3","TNN","SFRP1"),
  # PERICYTE & vSMC
  #pericytevSMC  = c("ACTA2","TAGLN"), #vSMC : vascular smooth muscel cell
  #pericytevSMC_pericyte  = c("RGS5"),
  #pericytevSMC_vSMC  = c("MYL9","TPM2","RERGL"),
  # ENDOTHELIAL CELL
  endothelial  = c("PECAM1","SELE","CLDN5","VWF"),
  #endothelial_lymphatic  = c("PROX1","LYVE1"),
  # MYELOID CELL
  myeloid  = c("HLA-DRA","FCER1G","TYROBP","AIF1"),
  #myeloid_dendritic  = c("CD1C"),
  #myeloid_langerhans  = c("CD207"),
  #myeloid_macrophage = c("CD68","RNASE1","ITGAX"),
  # LYMPHOCYTE
  lymphocyte  = c("CD3D","CD3E","CD52","IL7R"),
  # MELANOCYTE
  melanocyte  = c("DCT","MLANA","PMEL"),
  # SCHWANN CELL
  schwann  = c("CDH19","NGFR"),
  # MITOTIC CELL
  mitotic  = c("UBE2C","PCNA")
)

convert_rownames <- function(sceo, go, genes) {
  return(paste(rownames(sceo), rowData(sceo)$Symbol, sep = "."))
}

annotate_cells <- function(sceo, go, genes) {
  kmo <- lapply(genes, FUN=function(go) grep(paste0(go, "$", collapse="|"), rownames(sceo), value=TRUE))
  
  # TODO: try to modify this so that we don't have to convert the rownames anymore and use
  # the marker names save in rowData(sceo)$Symbol directly
  #kmo <- lapply(genes, FUN=function(go) grep(paste0(go, "$", collapse="|"), rowData(sceo)$Symbol, value=TRUE))
  #print(kmo)
  
  return(list(sceo, kmo))
}


rownames(sce.patient1_HS) <- convert_rownames(sce.patient1_HS, g.patient1_HS, genes)
results <- annotate_cells(sce.patient1_HS, g.patient1_HS, genes)
sce.patient1_HS <- results[[1]]
km.patient1_HS <- results[[2]]

rownames(sce.patient1_SCC) <- convert_rownames(sce.patient1_SCC, g.patient1_SCC, genes)
results <- annotate_cells(sce.patient1_SCC, g.patient1_SCC, genes)
sce.patient1_SCC <- results[[1]]
km.patient1_SCC <- results[[2]]

rownames(sce.patient2_HS) <- convert_rownames(sce.patient2_HS, g.patient2_HS, genes)
results <- annotate_cells(sce.patient2_HS, g.patient2_HS, genes)
sce.patient2_HS <- results[[1]]
km.patient2_HS <- results[[2]]

rownames(sce.patient2_AK) <- convert_rownames(sce.patient2_AK, g.patient2_AK, genes)
results <- annotate_cells(sce.patient2_AK, g.patient2_AK, genes)
sce.patient2_AK <- results[[1]]
km.patient2_AK <- results[[2]]

Pseudo-bulk aggregation

# mean logcounts by cluster
pseudobulk <- function(sceo, kmo) {
  pbo <- aggregateData(sceo, "logcounts", by=c("cluster"), fun="mean")

  # TODO : find a way to delete the left annotation which is unreadable
  
  # build a heatmap of the mean logcounts of the known markers:
  h <- pheatmap(assay(pbo)[unlist(kmo),], annotation_row=data.frame(row.names=unlist(kmo), type=rep(names(kmo), lengths(kmo))), split=rep(names(kmo), lengths(kmo)), annotation_names_row=F, cluster_rows=FALSE, scale="row", main=paste(attr(sceo, "name"),"before markers aggregation"), fontsize_row=6, fontsize_col=10, angle_col = "45")
  print(h)
  
  #--- aggregation markers
  # we will assign clusters to the cell type whose markers are the most expressed
  
  # we extract the pseudo-bulk counts of the markers for each cluster
  mato <- assay(pbo)[unlist(kmo),]
  
  # we aggregate across markers of the same type
  mato <- aggregate(t(scale(t(mato))), by=list(type=rep(names(kmo), lengths(kmo))), FUN=sum)
  
  # for each column (cluster), we select the row (cell type) which has the maximum aggregated value
  cl2o<- mato[,1][apply(mato[,-1], 2, FUN=which.max)]
  # we convert the cells' cluster labels to cell type labels:
  sceo$cluster2 <- cl2o[sceo$cluster]
  
  # we aggregate again to pseudo-bulk using the new clusters
  pbo <- aggregateData(sceo, "logcounts", by=c("cluster2"), fun="mean")
  
  # we plot again the expression of the markers as a sanity check
  
  # If we want to hide the gene markers show_rownames = FALSE
  h1 <- pheatmap(assay(pbo)[unlist(kmo),], annotation_row=data.frame(row.names=unlist(kmo), type=rep(names(kmo), lengths(kmo))), split=rep(names(kmo), lengths(kmo)), annotation_names_row=F, cluster_rows=FALSE, scale="row", main=paste(attr(sceo, "name"),"after markers aggregation"), fontsize_row=6, fontsize_col=10, angle_col = "45")
  print(h1)
  
  # UMAP plot
  p <- plotUMAP(sceo, colour_by="cluster2", text_by="cluster2", point_size=1) + ggtitle(attr(sceo, "name")) + theme(legend.title=element_blank())
  print(p)
  
  return(list(sceo, pbo, mato, cl2o))
}

#Patient 1 HS
#km.patient1_HS
#km.patient1_SCC
#rowData(sce.patient1_HS[c("keratinocyte", "fibroblast", "endothelial", "myeloid", )])$Symbol

results <- pseudobulk(sce.patient1_HS, km.patient1_HS)

sce.patient1_HS <- results[[1]]
pb.patient1_HS <- results[[2]]
mat.patient1_HS <- results[[3]]
cl2.patient1_HS <- results[[4]]

#Patient 1 SCC
results <- pseudobulk(sce.patient1_SCC, km.patient1_SCC)

sce.patient1_SCC <- results[[1]]
pb.patient1_SCC <- results[[2]]
mat.patient1_SCC <- results[[3]]
cl2.patient1_SCC <- results[[4]]

#Patient 2 HS
results <- pseudobulk(sce.patient2_HS, km.patient2_HS)

sce.patient2_HS <- results[[1]]
pb.patient2_HS <- results[[2]]
mat.patient2_HS <- results[[3]]
cl2.patient2_HS <- results[[4]]

#Patient 2 AK
results <- pseudobulk(sce.patient2_AK, km.patient2_AK)

sce.patient2_AK <- results[[1]]
pb.patient2_AK <- results[[2]]
mat.patient2_AK <- results[[3]]
cl2.patient2_AK <- results[[4]]

Sub population

cluster_subtype <- function(sceo, cell_type, known_markers, subtype_markers) {
  # select the cell labeled as keratinocytes in the previous step
  sceo.sub <- sceo[,sceo$cluster2==cell_type]
  
  sceo.sub <- remove_rare_genes(sceo.sub, 4)
  results <- variance_stabilization(sceo.sub, 2000)
  
  sceo.sub <- results[[1]]
  hvgo.sub <- results[[2]]
  
  #---- run the pipeline again on that subdataset
  sceo.sub <- sc_PCA(sceo.sub, hvgo.sub, known_markers)
  
  #--- clustering
  
  results <- sc_cluster(sceo.sub)
  sceo.sub <- results[[1]]
  go.sub <- results[[2]]
  
  results <- annotate_cells(sceo.sub, go.sub, subtype_markers)
  
  sceo.sub <- results[[1]]
  kmo.sub <- results[[2]]
  
  #---
  
  results <- pseudobulk(sceo.sub, kmo.sub)
  
  sceo.sub <- results[[1]]
  pbo.sub <- results[[2]]
  mato.sub <- results[[3]]
  cl2o.sub <- results[[4]]
  
  return(sceo.sub)

}


known_markers.kera <- c("KRT5","KRT14","COL17A1","KRT1","KRT10","LOR", "SPINK5","KRT6B","GJB2","GJB6","ATP1B1","MGST1","FASN","DCD","AQP5")

#annotate the subclusters
  genes.kera <- list(
  # classification: https://www.sciencedirect.com/science/article/pii/S0923181120301985?via%3Dihub#bib0050
    # KERATINOCYTE
    #keratinocyte = c("DSC3","DSP","LGALS7"),
    basal = c("KRT5","KRT14","COL17A1"),
    suprabasal = c("KRT1","KRT10"),
    differentiated = c("LOR", "SPINK5"),
    ORS = c("KRT6B"),
    channel = c("GJB2","GJB6","ATP1B1"),
    sebaceous_gland = c("MGST1","FASN"),
    sweat_gland  = c("DCD","AQP5")
    )
  
  
# TODO : find a way to avoid giving both known_merkers.kera and genes.kera since both contain the 
  # same information in essence. Maybe will be fixed by the first todo
sce.patient1_HS.kera <- cluster_subtype(sce.patient1_HS, "keratinocyte", known_markers.kera, genes.kera)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |==============================================================        |  89%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |==================================================================    |  94%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |======================================================================| 100%
## Adding KRT5 KRT10 AQP5 to the gene set used for PCA in Patient1 HS

sce.patient1_SCC.kera <- cluster_subtype(sce.patient1_SCC, "keratinocyte", known_markers.kera, genes.kera)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |===========                                                           |  15%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |============================                                          |  39%
  |                                                                            
  |==============================                                        |  42%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |==================================                                    |  48%
  |                                                                            
  |====================================                                  |  52%
  |                                                                            
  |======================================                                |  55%
  |                                                                            
  |========================================                              |  58%
  |                                                                            
  |==========================================                            |  61%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |=====================================================                 |  76%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |=========================================================             |  82%
  |                                                                            
  |===========================================================           |  85%
  |                                                                            
  |==============================================================        |  88%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |==================================================================    |  94%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |======================================================================| 100%
## Adding FASN to the gene set used for PCA in Patient1 SCC

sce.patient2_HS.kera <- cluster_subtype(sce.patient2_HS, "keratinocyte", known_markers.kera, genes.kera)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |============                                                          |  18%
  |                                                                            
  |==============                                                        |  21%
  |                                                                            
  |================                                                      |  24%
  |                                                                            
  |===================                                                   |  26%
  |                                                                            
  |=====================                                                 |  29%
  |                                                                            
  |=======================                                               |  32%
  |                                                                            
  |=========================                                             |  35%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |=============================================                         |  65%
  |                                                                            
  |===============================================                       |  68%
  |                                                                            
  |=================================================                     |  71%
  |                                                                            
  |===================================================                   |  74%
  |                                                                            
  |======================================================                |  76%
  |                                                                            
  |========================================================              |  79%
  |                                                                            
  |==========================================================            |  82%
  |                                                                            
  |============================================================          |  85%
  |                                                                            
  |==============================================================        |  88%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |==================================================================    |  94%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |======================================================================| 100%
## Adding FASN to the gene set used for PCA in Patient2 HS

sce.patient2_AK.kera <- cluster_subtype(sce.patient2_AK, "keratinocyte", known_markers.kera, genes.kera)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |============                                                          |  18%
  |                                                                            
  |==============                                                        |  21%
  |                                                                            
  |================                                                      |  24%
  |                                                                            
  |===================                                                   |  26%
  |                                                                            
  |=====================                                                 |  29%
  |                                                                            
  |=======================                                               |  32%
  |                                                                            
  |=========================                                             |  35%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |=============================================                         |  65%
  |                                                                            
  |===============================================                       |  68%
  |                                                                            
  |=================================================                     |  71%
  |                                                                            
  |===================================================                   |  74%
  |                                                                            
  |======================================================                |  76%
  |                                                                            
  |========================================================              |  79%
  |                                                                            
  |==========================================================            |  82%
  |                                                                            
  |============================================================          |  85%
  |                                                                            
  |==============================================================        |  88%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |==================================================================    |  94%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |======================================================================| 100%

# need the previous block to be runned (at least until l. 298)

genes.dyn <- list(
  # classification: https://www.sciencedirect.com/science/article/pii/S0092867420306723
  keratinocyte_basal = "COL17A1",
  keratinocyte_cycling = "MKI67",
  keratinocyte_differentiating = "KRT1"
  )

dynamic_plot <- function(sceo, go, genes) {
    kmo <- lapply(genes, FUN=function(go) grep(paste0(go, "$", collapse="|"), rownames(sceo), value=TRUE))
  
  # mean logcounts by cluster:
  pbo <- aggregateData(sceo, "logcounts", by=c("cluster"), fun="mean")
  lengths(kmo)
  h <- pheatmap(assay(pbo)[unlist(kmo),], annotation_row=data.frame(row.names=unlist(kmo), type=rep(names(kmo), lengths(kmo))), split=rep(names(kmo), lengths(kmo)), cluster_rows=FALSE, scale="row", main="Before markers aggregation", fontsize_row=6, fontsize_col=10)
  h
  
  # we will assign clusters to the cell type whose markers are the most expressed
  # we extract the pseudo-bulk counts of the markers for each cluster
  mato <- assay(pbo)[unlist(kmo),]
  
  # we aggregate across markers of the same type
  mato <- aggregate(t(scale(t(mato))), by=list(type=rep(names(kmo), lengths(kmo))), FUN=sum)
  
  # for each column (cluster), we select the row (cell type) which has the maximum aggregated value
  cl3o <- mato[,1][apply(mato[,-1], 2, FUN=which.max)]
  
  # we convert the cells' cluster labels to cell type labels:
  sceo$cluster3 <- cl3o[sceo$cluster]
  
  print(sceo)
  
  # we aggregate again to pseudo-bulk using the new clusters
  pbo <- aggregateData(sceo, "logcounts", by=c("cluster3"), fun="mean")
  # we plot again the expression of the markers as a sanity check
  h1 <- pheatmap(assay(pbo)[unlist(kmo),], annotation_row=data.frame(row.names=unlist(kmo), type=rep(names(kmo), lengths(kmo))), split=rep(names(kmo), lengths(kmo)), cluster_rows=FALSE, scale="row", main="After markers aggregation", fontsize_row=6, fontsize_col=10) #, scale="row", main="After markers aggregation", fontsize_row=6, fontsize_col=10)
  h1
  plotUMAP(sceo, colour_by="cluster3", text_by="cluster3", point_size=1)
  
  
  #---- Histograms
  total <- ncol(sceo)
  basal <- ncol(sceo[,sceo$cluster3=="keratinocyte_basal"])
  cycling <- ncol(sceo[,sceo$cluster3=="keratinocyte_cycling"])
  diff <- ncol(sceo[,sceo$cluster3=="keratinocyte_differentiating"])
  counts <- c(basal, cycling, diff)
  percentage <- counts/total
  kera.dyn <- data.frame(names=c("Basal", "Cycling", "Differentiating"), percentage=percentage)
  kera.dyn
  
  p <- ggplot(data=kera.dyn, aes(x=names, y=percentage, fill=names)) + geom_bar(stat="identity") + labs(title=paste("Proportion of Keratinocyte states in", attr(sceo, "name")),x="Subtypes", y="Percentage") + ylim(0,1) + theme(legend.position='none')
  print(p)
  
}

dynamic_plot(sce.patient1_HS.kera, g.patient1_HS.kera, genes.dyn)
## class: SingleCellExperiment 
## dim: 17383 1354 
## metadata(1): Samples
## assays(2): counts logcounts
## rownames(17383): ENSG00000238009.AL627309.1 ENSG00000241860.AL627309.5
##   ... ENSG00000278817.AC007325.4 ENSG00000277196.AC007325.2
## rowData names(3): ID Symbol Type
## colnames(1354): AAACCTGCATAGAAAC-1 AAACCTGGTCTCATCC-1 ...
##   TTTGTCACATGGTCTA-1 TTTGTCAGTCCTAGCG-1
## colData names(5): Sample Barcode cluster cluster2 cluster3
## reducedDimNames(3): PCA TSNE UMAP
## altExpNames(1): Antibody Capture

dynamic_plot(sce.patient1_SCC.kera, g.patient1_SCC.kera, genes.dyn)
## class: SingleCellExperiment 
## dim: 16157 706 
## metadata(1): Samples
## assays(2): counts logcounts
## rownames(16157): ENSG00000241860.AL627309.5 ENSG00000237491.LINC01409
##   ... ENSG00000277666.AC136352.3 ENSG00000273554.AC136616.1
## rowData names(3): ID Symbol Type
## colnames(706): AAACCTGGTAGATTAG-1 AAACGGGGTCCGTGAC-1 ...
##   TTTGCGCGTCGGCATC-1 TTTGGTTAGTCATGCT-1
## colData names(5): Sample Barcode cluster cluster2 cluster3
## reducedDimNames(3): PCA TSNE UMAP
## altExpNames(1): Antibody Capture

dynamic_plot(sce.patient2_HS.kera, g.patient2_HS.kera, genes.dyn)
## class: SingleCellExperiment 
## dim: 16623 1907 
## metadata(1): Samples
## assays(2): counts logcounts
## rownames(16623): ENSG00000237491.LINC01409 ENSG00000228794.LINC01128
##   ... ENSG00000278817.AC007325.4 ENSG00000277196.AC007325.2
## rowData names(3): ID Symbol Type
## colnames(1907): AAACCTGGTCAAACTC-1 AAACGGGAGCCAGTAG-1 ...
##   TTTGTCATCTATCCCG-1 TTTGTCATCTCACATT-1
## colData names(5): Sample Barcode cluster cluster2 cluster3
## reducedDimNames(3): PCA TSNE UMAP
## altExpNames(1): Antibody Capture

dynamic_plot(sce.patient2_AK.kera, g.patient2_AK.kera, genes.dyn)
## class: SingleCellExperiment 
## dim: 16877 1106 
## metadata(1): Samples
## assays(2): counts logcounts
## rownames(16877): ENSG00000241860.AL627309.5 ENSG00000237491.LINC01409
##   ... ENSG00000278817.AC007325.4 ENSG00000277196.AC007325.2
## rowData names(3): ID Symbol Type
## colnames(1106): AAACCTGAGAGTTGGC-1 AAACCTGTCCTTGACC-1 ...
##   TTTGGTTGTGGTAACG-1 TTTGGTTTCTACCTGC-1
## colData names(5): Sample Barcode cluster cluster2 cluster3
## reducedDimNames(3): PCA TSNE UMAP
## altExpNames(1): Antibody Capture

known_markers.fibro <- c("SLPI","CTHRC1","MFAP5","TSPAN8","CCL19","APOE","CXCL2","CXCL3","EFEMP1","APCDD1","ID1","WIF1","COL18A1","PTGDS","ASPN","POSTN","GPC3","TNN","SFRP1")

genes.fibro <- list(
   #fibroblast  = c("LUM","MMP2"),
#   # fibroblast subclassification  from: https://www.nature.com/articles/s42003-020-0922-4#
   secretory_reticular = c("SLPI","CTHRC1","MFAP5","TSPAN8"),
   proinflammatory = c("CCL19","APOE","CXCL2","CXCL3","EFEMP1"),
   secretory_papillary = c("APCDD1","ID1","WIF1","COL18A1","PTGDS"),
   mesenchymal = c("ASPN","POSTN","GPC3","TNN","SFRP1")
)

sce.patient1_HS.fibro <- cluster_subtype(sce.patient1_HS, "fibroblast", known_markers.fibro, genes.fibro)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |=====                                                                 |   8%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |===========                                                           |  15%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |==============================                                        |  42%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |========================================                              |  58%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |=========================================================             |  81%
  |                                                                            
  |===========================================================           |  85%
  |                                                                            
  |==============================================================        |  88%
  |                                                                            
  |=================================================================     |  92%
  |                                                                            
  |===================================================================   |  96%
  |                                                                            
  |======================================================================| 100%
## Adding ID1 WIF1 SFRP1 to the gene set used for PCA in Patient1 HS

sce.patient1_SCC.fibro <- cluster_subtype(sce.patient1_SCC, "fibroblast", known_markers.fibro, genes.fibro)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |======================================================================| 100%
## Adding ID1 to the gene set used for PCA in Patient1 SCC

sce.patient2_HS.fibro <- cluster_subtype(sce.patient2_HS, "fibroblast", known_markers.fibro, genes.fibro)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |======================================================================| 100%

sce.patient2_AK.fibro <- cluster_subtype(sce.patient2_AK, "fibroblast", known_markers.fibro, genes.fibro)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |=====                                                                 |   8%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |===========                                                           |  15%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |==============================                                        |  42%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |========================================                              |  58%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |=========================================================             |  81%
  |                                                                            
  |===========================================================           |  85%
  |                                                                            
  |==============================================================        |  88%
  |                                                                            
  |=================================================================     |  92%
  |                                                                            
  |===================================================================   |  96%
  |                                                                            
  |======================================================================| 100%
## Adding CCL19 to the gene set used for PCA in Patient2 AK

# The following code was used to check which genes were missing from in the data
# the known_markers are from the litterature

data_markers <- rowData(sce.patient1_HS)$Symbol
known_markers <- c("DSC3","DSP","LGALS7","KRT5","KRT14","COL17A1","KRT1","KRT10","LOR", "SPINK5","KRT6B","GJB2","GJB6","ATP1B1","MGST1","FASN","DCD","AQP5","DCDN","LUM","MMP2","WISP2","SLPI","CTHRC1","MFAP5","TSPAN8","CCL19","APOE","CXCL2","CXCL3","EFEMP1","APCDD1","ID1","WIF1","COL18A1","PTGDS","ASPN","POSTN","GPC3","TNN","SFRP1","ACTA2","TAGLN","RGS5","MYL9","TPM2","RERGL","PECAM1","SELE","CLDN5","VWF","PROX1","LYVE1","HLA-DRA","FCER1G","TYROBP","AIF1","CD1C","CD207","CD68","RNASE1","SEPP1","ITGAX","CD3D","CD3E","CD52","IL7R","DCT","MLANA","PMEL","TYP1","CDH19","NGFR","UBE2C","PCNA")


idx_notfound <- which(!(known_markers %in% rowData(sce.patient1_HS)$Symbol))

print(paste("Done:", length(known_markers)-length(idx_notfound), "/", length(known_markers), "markers found"))
cat("Genes missing in dataset: ", known_markers[idx_notfound])